R Markdown

if (!require("knitr")) install.packages("knitr")
if (!require("SummarizedExperiment")) install.packages("SummarizedExperiment")
if (!require("edgeR")) install.packages("edgeR")
if (!require("tidyr")) install.packages("tidyr")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("scales")) install.packages("scales")
if (!require("plotly")) install.packages("plotly")

source("functions.R")
# VARIABLES

data_samples <- params$data_samples
data_counts <- params$data_counts
data_annotation <- params$data_annotation
method <- params$method
excluded_samples <- params$excluded_samples
design_value <- params$design_value
matrix_value <- params$matrix_value
alpha <- params$alpha

method
## [1] "voom"
excluded_samples
## character(0)
design_value
## [1] "~0 + group"
matrix_value
## [1] "cALCL - CD4"
alpha
## [1] 0.01
#READ DATA INTO VARIABLE

#data_samples <- read.csv("~/Desktop/final_310/meta2.csv", row.names=1, header = TRUE, sep = ",")

#data_counts <- read.csv("~/Desktop/final_310/all_samples.fragments_per_gene.tsv", row.names=1, header = TRUE, sep = "\t")
#data_counts <- data_counts[!grepl('^__', rownames(data_counts)),]

#data_annotation <- read.csv("~/Desktop/final_310/gene_annotation.tsv", row.names=1, header = TRUE, sep = "\t")
#READ FILES INTO SE (function: alignment)

se <- readCountsFromTable(data_counts)

alignmentSummaryPlot(se)
complexityPlot(se)
#READ FILES INTO SE (function: alignment)

se <- addSamplesFromTableToSE(se, data_samples)
se <- addAnnotationsFromTableToSE(se, data_annotation)
#GET DGE

dge <- DGEList(counts = assay(se), samples = colData(se), genes = rowData(se))
row.names(dge$genes) <- row.names(dge$counts)

dge <- dge[ rowSums( abs( dge$counts ) ) > 1, ]

countDistributionLinePlot(dge)
#GET HIGH EXPRESSED FEATURES (function: raw_data)

#? Szymon how to determine which method to use
#p310 uses edger, for edger analysis
#p306 uses else, for edger and voom/limma analysis

if (method == "edger") {
  edger <- calcNormFactors( dge, method = "TMM")
  counts <- cpm(edger, log = TRUE)
  selectedFeatures <- rownames( edger )[ apply( counts, 1, function( v ) sum( v >= 1 ) ) >= 1/4 * ncol( counts ) ]

} else {
  #selectedFeatures <- filterByExpr(dge, model.matrix(eval(parse(text=design_value)), dge$samples ))
  selectedFeatures <- ( rowSums( cpm( dge ) > 10 ) >= 3 )
}
#GET HIGH EXPRESSED FEATURES (function: raw_data)

highExprDge <- dge[ selectedFeatures,, keep.lib.sizes = FALSE ]
# NORMALISE DATA (function: norm_data)

normDge <- calcNormFactors( highExprDge, method = "TMM")
# FILTER SAMPLES IF NECESSARY

if (!is.null(excluded_samples)) {
  normDge$counts <- normDge$counts[,!colnames(normDge$counts) %in% excluded_samples]
  data_samples <- data_samples[!rownames(data_samples) %in% excluded_samples, ]
  data_samples <- droplevels(data_samples)
  
  se <- addSamplesFromTableToSE(se, data_samples)
  
  tempDge <- DGEList(counts = normDge$counts, samples = colData(se), genes = normDge$genes)
  tempDge <- calcNormFactors( tempDge, method = "TMM")
  normDge <- tempDge
}

tempDge <- normDge
tempDge$counts <- 2^(cpm(normDge, log = TRUE))
countDistributionLinePlot(tempDge)
variancePcaPlot(tempDge)
# CREATE DESIGN

design <- model.matrix(eval(parse(text=design_value)), normDge$samples)
design
##           groupcALCL groupCD4
## KT_1               1        0
## KT_2               1        0
## KT_3               1        0
## KT_4               1        0
## KT_5               1        0
## KT_6               1        0
## KT_7               1        0
## KT_8               1        0
## KT_9               1        0
## KT_10              1        0
## KT_11              1        0
## KT_12              1        0
## ERR431582          0        1
## ERR431598          0        1
## ERR431614          0        1
## ERR431620          0        1
## ERR431626          0        1
## ERR431575          0        1
## ERR431576          0        1
## ERR431588          0        1
## ERR431603          0        1
## ERR431616          0        1
## ERR431571          0        1
## ERR431574          0        1
## ERR431580          0        1
## ERR431581          0        1
## ERR431610          0        1
## ERR431595          0        1
## ERR431597          0        1
## ERR431604          0        1
## ERR431618          0        1
## ERR431625          0        1
## ERR431570          0        1
## ERR431577          0        1
## ERR431584          0        1
## ERR431587          0        1
## ERR431622          0        1
## ERR431566          0        1
## ERR431579          0        1
## ERR431600          0        1
## ERR431615          0        1
## ERR431628          0        1
## ERR431583          0        1
## ERR431589          0        1
## ERR431601          0        1
## ERR431606          0        1
## ERR431609          0        1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
#moet dit in een specifieke volgorde?: (~0 + dex + celltype - VS - ~0 + celltype + dex)
for (design_column in colnames(design)) {
  for (sample_column in colnames(normDge$samples)) {
    if (grepl(sample_column, design_column)) {
      colnames(design) <- sub(sample_column, "", colnames(design))
    }
  }
}

matrix_value <- strsplit(matrix_value, ",")[[1]]
contr.matrix <- makeContrasts(contrasts = matrix_value, levels=design)
contr.matrix
##        Contrasts
## Levels  cALCL - CD4
##   cALCL           1
##   CD4            -1
# FIT

if (method == "edger") {
  #normDge <- estimateGLMCommonDisp(normDge, design)
  #normDge <- estimateGLMTrendedDisp(normDge, design)
  #normDge <- estimateGLMTagwiseDisp(normDge, design)
  normDge <- estimateDisp(normDge, design)
  vfit <- glmFit(normDge, design)
  efit <- glmLRT(vfit, contrast = contr.matrix)
  
} else {
  voom <- voom(normDge, design)
  vfit <- lmFit(voom)
  vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
  efit <- eBayes(vfit)
  
}

efit$DE <- decideTests(efit, p.value = alpha)
summary(efit$DE)
##        cALCL - CD4
## Down          6564
## NotSig        3459
## Up            5777
# CREATE deTab TABLE

if (method == "edger") {
  deTab <- topTags(efit, n=Inf)$table
  deTab <- deTab[order(rownames(deTab)),]
  deTab$DE <- efit$DE
  deTab$avgLog2CPM <- efit$table$logCPM
  deTab$avgLog2FC <- efit$table$logFC
  deTab$adj.P.Val <- p.adjust(deTab$PValue, method = "BH")
  colnames(deTab)[colnames(deTab)=="PValue"] <- "P.Value"
  
} else {
  deTab <- data.frame(topTable(efit, coef=1, n=Inf))
  deTab <- deTab[order(rownames(deTab)),]
  deTab$DE <- efit$DE
  colnames(deTab)[colnames(deTab)=="logFC"] <- "avgLog2FC"
  colnames(deTab)[colnames(deTab)=="AveExpr"] <- "avgLog2CPM"
}

maPlot(deTab)
pValuePlot(deTab)
# GET DE GENES

allGenes <- deTab
allGenes <- allGenes[order(rank(allGenes$adj.P.Val)),]
  
DEG <- allGenes[allGenes$adj.P.Val < alpha,]

#EXTRA FILTER LOGFC
DEG <- DEG[which(DEG$avgLog2FC < -2 | DEG$avgLog2FC > 2),]
up <- DEG[which(DEG$DE == 1),]
down <- DEG[which(DEG$DE == -1),]
deTab <- deTab[which(deTab$avgLog2FC < -2 | deTab$avgLog2FC > 2),]
# ENRICHMENT ANALYSIS

save(efit, file="efit.RData")
# SAVE ANALYSIS

normDge$counts <- 2^(cpm(normDge, log = TRUE))
save(deTab, normDge, allGenes, DEG, file="analysis.RData")
#INFO

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=nl_NL.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=nl_NL.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=nl_NL.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=nl_NL.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] plotly_4.9.0                scales_1.0.0               
##  [3] ggplot2_3.2.1               tidyr_1.0.0                
##  [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
##  [7] BiocParallel_1.18.1         matrixStats_0.55.0         
##  [9] Biobase_2.44.0              GenomicRanges_1.36.1       
## [11] GenomeInfoDb_1.20.0         IRanges_2.18.3             
## [13] S4Vectors_0.22.1            BiocGenerics_0.30.0        
## [15] knitr_1.25                  shinyjs_1.0                
## [17] shinyWidgets_0.4.9          shinydashboard_0.7.1       
## [19] shiny_1.4.0.9000            edgeR_3.26.8               
## [21] limma_3.40.6               
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.1             splines_3.6.1          viridisLite_0.3.0     
##  [4] bit64_0.9-7            jsonlite_1.6           assertthat_0.2.1      
##  [7] blob_1.2.0             GenomeInfoDbData_1.2.1 yaml_2.2.0            
## [10] pillar_1.4.2           RSQLite_2.1.2          backports_1.1.5       
## [13] lattice_0.20-38        glue_1.3.1             digest_0.6.22         
## [16] RColorBrewer_1.1-2     promises_1.1.0         XVector_0.24.0        
## [19] colorspace_1.4-1       htmltools_0.4.0        httpuv_1.5.2          
## [22] Matrix_1.2-17          pkgconfig_2.0.3        zlibbioc_1.30.0       
## [25] purrr_0.3.3            xtable_1.8-4           GO.db_3.8.2           
## [28] later_1.0.0            tibble_2.1.3           DT_0.9                
## [31] withr_2.1.2            lazyeval_0.2.2         magrittr_1.5          
## [34] crayon_1.3.4           mime_0.7               memoise_1.1.0         
## [37] evaluate_0.14          data.table_1.12.6      rsconnect_0.8.15      
## [40] tools_3.6.1            org.Hs.eg.db_3.8.2     lifecycle_0.1.0       
## [43] stringr_1.4.0          munsell_0.5.0          locfit_1.5-9.1        
## [46] AnnotationDbi_1.46.1   packrat_0.5.0          compiler_3.6.1        
## [49] rlang_0.4.1            grid_3.6.1             RCurl_1.95-4.12       
## [52] rstudioapi_0.10        htmlwidgets_1.5.1      crosstalk_1.0.0       
## [55] bitops_1.0-6           rmarkdown_1.16         gtable_0.3.0          
## [58] DBI_1.0.0              R6_2.4.0               dplyr_0.8.3           
## [61] fastmap_1.0.1          bit_1.1-14             zeallot_0.1.0         
## [64] stringi_1.4.3          Rcpp_1.0.2             vctrs_0.2.0           
## [67] tidyselect_0.2.5       xfun_0.10